suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(here))
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/zygoticEmbryogenesis/doc/4Datasets_v6.csv",
col_types = cols(col_character(),
col_character(),
col_factor(),
col_character(),
col_factor(),
col_character(),
col_double(),
col_double(),
col_factor())) %>%
mutate(Tissue=factor(Tissue)) %>%
mutate(Time=factor(Time)) %>%
mutate(Experiment=factor(Experiment))
##need to delete User.ID, Sample.ID, Replicate, Mreads and X..Q30 (columns 2,4,6,7,8)
samples <- subset(samples, select = -c(User.ID, Sample.ID, Replicate, Mreads, X..Q30) )
samples$Experiment
## [1] Zygotic Embryogenesis Zygotic Embryogenesis
## [3] Zygotic Embryogenesis Zygotic Embryogenesis
## [5] Zygotic Embryogenesis Zygotic Embryogenesis
## [7] Zygotic Embryogenesis Zygotic Embryogenesis
## [9] Zygotic Embryogenesis Zygotic Embryogenesis
## [11] Zygotic Embryogenesis Zygotic Embryogenesis
## [13] Zygotic Embryogenesis Zygotic Embryogenesis
## [15] Zygotic Embryogenesis Zygotic Embryogenesis
## [17] Zygotic Embryogenesis Zygotic Embryogenesis
## [19] Zygotic Embryogenesis Zygotic Embryogenesis
## [21] Zygotic Embryogenesis Zygotic Embryogenesis
## [23] Zygotic Embryogenesis Zygotic Embryogenesis
## [25] Zygotic Embryogenesis Zygotic Embryogenesis
## [27] Zygotic Embryogenesis Zygotic Embryogenesis
## [29] Zygotic Embryogenesis Zygotic Embryogenesis
## [31] Zygotic Embryogenesis Zygotic Embryogenesis
## [33] Zygotic Embryogenesis Zygotic Embryogenesis
## [35] Zygotic Embryogenesis Zygotic Embryogenesis
## [37] Zygotic Embryogenesis Zygotic Embryogenesis
## [39] Zygotic Embryogenesis Zygotic Embryogenesis
## [41] Zygotic Embryogenesis Zygotic Embryogenesis
## [43] Zygotic Embryogenesis Zygotic Embryogenesis
## [45] Zygotic Embryogenesis Zygotic Embryogenesis
## [47] Zygotic Embryogenesis Zygotic Embryogenesis
## [49] Zygotic Embryogenesis Zygotic Embryogenesis
## [51] Zygotic Embryogenesis Zygotic Embryogenesis
## [53] Zygotic Embryogenesis Zygotic Embryogenesis
## [55] Zygotic Embryogenesis Zygotic Embryogenesis
## [57] Zygotic Embryogenesis 29Seed
## [59] 29Seed 29Seed
## [61] 29Seed 29Seed
## [63] 29Seed 29Seed
## [65] 29Seed 29Seed
## [67] 29Seed 29Seed
## [69] 29Seed Somatic Embryogenesis Germinants
## [71] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [73] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [75] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [77] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [79] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [81] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [83] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [85] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [87] Somatic Embryogenesis Germinants Somatic Embryogenesis Germinants
## [89] Somatic Embryogenesis Somatic Embryogenesis
## [91] Somatic Embryogenesis Somatic Embryogenesis
## [93] Somatic Embryogenesis Somatic Embryogenesis
## [95] Somatic Embryogenesis Somatic Embryogenesis
## [97] Somatic Embryogenesis Somatic Embryogenesis
## [99] Somatic Embryogenesis Somatic Embryogenesis
## [101] Somatic Embryogenesis Somatic Embryogenesis
## [103] Somatic Embryogenesis Somatic Embryogenesis
## [105] Somatic Embryogenesis Somatic Embryogenesis
## [107] Somatic Embryogenesis Somatic Embryogenesis
## [109] Somatic Embryogenesis Somatic Embryogenesis
## [111] Somatic Embryogenesis Somatic Embryogenesis
## [113] Somatic Embryogenesis Somatic Embryogenesis
## [115] Somatic Embryogenesis Somatic Embryogenesis
## [117] Somatic Embryogenesis Somatic Embryogenesis
## [119] Somatic Embryogenesis Somatic Embryogenesis
## [121] Somatic Embryogenesis Somatic Embryogenesis
## 4 Levels: Zygotic Embryogenesis ... Somatic Embryogenesis
sZE <- subset(samples,samples$Experiment == "Zygotic Embryogenesis")
s29 <- subset(samples,samples$Experiment == "29Seed")
sSG <- subset(samples,samples$Experiment == "Somatic Embryogenesis Germinants")
sSE <- subset(samples,samples$Experiment == "Somatic Embryogenesis")
sSE$Time <- sSE$Tissue
sSE$Time <- str_c("B",sSE$Time)
sSE$Tissue <- c("SE")
samples <- bind_rows(sZE,s29,sSG,sSE)
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
samples$Tissue <- factor(samples$Tissue)
lb.filelistZE <- list.files(here("data/RNA-Seq/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
##ZE is missing 204 in L002, and need to remove P11562_112 from both
lb.filelistZE <- (str_subset(lb.filelistZE, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_112_S11_L00", negate = TRUE))
lb.filterlistZEuniq <- (str_subset(lb.filelistZE, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_204"))
lb.filterlistZE <- str_subset(lb.filelistZE,"L001_sortmerna_trimmomatic")
lb.filelistZEWorking <- (str_subset(lb.filelistZE, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_204", negate = TRUE))
lb.filterlistZE1 <- str_subset(lb.filelistZEWorking,"L001_sortmerna_trimmomatic")
names(lb.filterlistZE1) <- sapply(lapply(strsplit(lb.filterlistZE1,"_"),"[",1:2),paste,collapse="_")
lb.filterlistZE2 <- str_subset(lb.filelistZEWorking,"L002_sortmerna_trimmomatic")
names(lb.filterlistZE2) <- sapply(lapply(strsplit(lb.filterlistZE2,"_"),"[",1:2),paste,collapse="_")
names(lb.filterlistZEuniq) <- sapply(lapply(strsplit(lb.filterlistZEuniq,"_"),"[",1:2),paste,collapse="_")
names(lb.filterlistZE1) <- str_replace(names(lb.filterlistZE1),".*salmon/","")
names(lb.filterlistZE2) <- str_replace(names(lb.filterlistZE2),".*salmon/","")
names(lb.filterlistZEuniq) <- str_replace(names(lb.filterlistZEuniq),".*salmon/","")
part1 <- suppressMessages(round(tximport(files = lb.filterlistZE1,
type = "salmon",txOut=TRUE)$counts))
part2 <- suppressMessages(round(tximport(files = lb.filterlistZE2,
type = "salmon",txOut=TRUE)$counts))
part3 <- suppressMessages(round(tximport(files = lb.filterlistZEuniq,
type = "salmon",txOut=TRUE)$counts))
all(colnames(part1)==colnames(part2))
## [1] TRUE
countsZE <- part1 + part2
countsREARRANGE <- cbind(countsZE[,54:55])
countsZE <- countsZE[,-54:-55]
countsZE <- cbind(countsZE, part3, countsREARRANGE)
#################end of ZE
lb.filelistSomaticEmbGerm <- list.files("/mnt/picea/projects/spruce/uegertsdotter/SE-germinants/salmon",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
#################start of SomaticEmbGerm
##splitting data in order to sum the counts
lb.filterlistSomaticEmbGerm <- lb.filelistSomaticEmbGerm
lb.filterlistSomaticEmbGerm1 <- (str_subset(lb.filterlistSomaticEmbGerm, "AC7", negate = TRUE))
names(lb.filterlistSomaticEmbGerm1) <- sapply(lapply(strsplit(lb.filterlistSomaticEmbGerm1,"_"),"[",4:5),paste,collapse="_")
lb.filterlistSomaticEmbGerm2 <- (str_subset(lb.filterlistSomaticEmbGerm, "BC7", negate = TRUE))
names(lb.filterlistSomaticEmbGerm2) <- sapply(lapply(strsplit(lb.filterlistSomaticEmbGerm2,"_"),"[",4:5),paste,collapse="_")
part1 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmbGerm1,
type = "salmon",txOut=TRUE)$counts))
part2 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmbGerm2,
type = "salmon",txOut=TRUE)$counts))
all(colnames(part1)==colnames(part2))
## [1] TRUE
countsEmbGerm <- part1 + part2
#################end of EmbGerm
lb.filelist29Seed <- list.files("/mnt/picea/projects/spruce/uegertsdotter/29_Spruce_Seeds_Project/salmon",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
lb.filterlist29Seed <- lb.filelist29Seed
names(lb.filterlist29Seed) <- sapply(lapply(strsplit(lb.filterlist29Seed,"_"),"[",7:8),paste,collapse="_")
counts29Seed <- suppressMessages(round(tximport(files = lb.filterlist29Seed,
type = "salmon",txOut=TRUE)$counts))
###need to recorder the ones with a letter infront of them
lb.filelistSomaticEmb <- list.files("/mnt/picea/projects/spruce/uegertsdotter/22_Somatic_Embryogenesis_Project/salmon",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
lb.filterlistSomaticEmb <- str_subset(lb.filelistSomaticEmb,"L002", negate = TRUE)
#################need to merge countsZE and part3, between column 53 and 54 (to become the new column 54)
lb.filterlistSomaticEmb1 <- str_subset(lb.filelistSomaticEmb,"L002_sortmerna_trimmomatic")
names(lb.filterlistSomaticEmb1) <- sapply(lapply(strsplit(lb.filterlistSomaticEmb1,"_"),"[",4:5),paste,collapse="_")
lb.filterlistSomaticEmb2 <- str_subset(lb.filelistSomaticEmb,"L002_sortmerna_trimmomatic")
names(lb.filterlistSomaticEmb2) <- sapply(lapply(strsplit(lb.filterlistSomaticEmb2,"_"),"[",4:5),paste,collapse="_")
lb.filterlistSomaticEmb3 <- str_subset(lb.filelistSomaticEmb,"L00", negate = TRUE)
names(lb.filterlistSomaticEmb3) <- sapply(lapply(strsplit(lb.filterlistSomaticEmb3,"_"),"[",7:8),paste,collapse="_")
names(lb.filterlistSomaticEmb1) <- str_replace(names(lb.filterlistSomaticEmb1),".*salmon/","")
names(lb.filterlistSomaticEmb2) <- str_replace(names(lb.filterlistSomaticEmb2),".*salmon/","")
part1 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmb1,
type = "salmon",txOut=TRUE)$counts))
part2 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmb2,
type = "salmon",txOut=TRUE)$counts))
part3 <- suppressMessages(round(tximport(files = lb.filterlistSomaticEmb3,
type = "salmon",txOut=TRUE)$counts))
all(colnames(part1)==colnames(part2))
## [1] TRUE
countsSomaticEmb <- part1 + part2
countsSomaticEmb <- cbind(part3, countsSomaticEmb)
################# end of SomaticEmb
Filter and select only one duplicate from each sample.
#removed P11562_112 row from sample list
samples <- filter(samples, !grepl("P11562_112",NGI.ID))
lb.filterlist <- c(lb.filterlistZE,lb.filterlist29Seed,lb.filterlistSomaticEmbGerm1,lb.filterlistSomaticEmb)
lb.filterlist <- c(lb.filterlistZE,lb.filterlist29Seed,lb.filterlistSomaticEmbGerm1,lb.filterlistSomaticEmb)
#####Filelists have been combined, however not able to proceed - NGI IDs do not match.
#####All samples should have the same Tissue Type in the Somatic and 29Seed - Time is not changed either - they should all be the same.
stopifnot(all(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist)))
#print(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist))
##assign names to the filtered filelist (which removed L002)
names(lb.filterlist) <- samples$NGI.ID
#lb.filterlist <- lb.filterlist[samples$Tissue %in% c("ZE","FMG","S","Normal","Aberrant","Non-EMB","PEM","DKM","SM","LSM","SD","ED","RO","PLS","ZE-R")] ##Are we only looking at ZE?
#lb.filterlist <- lb.filterlist[samples$Experiment %in% c("Zygotic","29Seed","Somatic Embryogenesis Germinants","Somatic Embryogenesis")]
Read the expression at the gene level (there is one transcript per gene)
#lb.g <- suppressMessages(tximport(files = lb.filterlist,
# type = "salmon",txOut=TRUE))
#####redo this lb.g <- cbind(countsZE, countsSEED, countsSomaticEmbGerm, countsSomaticEmb) in the correct order
#####the order is ZE, Seeds, SomaticEmbGerm, SomaticEmb
lb.g <- cbind(countsZE, counts29Seed, countsEmbGerm, countsSomaticEmb)
counts <- lb.g
Check how many genes are never expressed - reasonable level of non-expressed genes indicated.
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "11.1% percent (7331) of 66069 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
aes(x,y,col=Experiment,fill=Tissue)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
aes(x,y,col=Experiment,fill=)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning in melt(log10(rowMeans(counts))): The melt generic in data.table
## has been passed a numeric and will attempt to redirect to the relevant
## reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend
## the namespace like reshape2::melt(log10(rowMeans(counts))). In the next
## version, this warning will become an error.
## Warning: Removed 7331 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Tissue=samples$Tissue[match(ind,samples$NGI.ID)]) %>%
mutate(Experiment=samples$Experiment[match(ind,samples$NGI.ID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Tissue)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 3391332 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 3391332 rows containing non-finite values (stat_density).
dir.create(here("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("analysis/salmon/4Datasets-unnormalised-gene-expression_data.csv"))
############## change export location name
########################################want to get here to the Data normalisation
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ Experiment)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
dds <- dds[,!(dds$NGI.ID == "P11562_148")]
save(dds,file=here("analysis/salmon/4Datasets-dds.rda"))
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is relatively variable (0 to 200 %) however the low end of the variance is likely due to poor samples where little sequencing data was actually produce that wasnt 16s bacterial.
dds <- estimateSizeFactors(dds)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
sizes <- sizeFactors(dds)
pander(sizes)
| P11562_101 | P11562_102 | P11562_103 | P11562_104 | P11562_105 | P11562_106 |
|---|---|---|---|---|---|
| 0.8058 | 0.4662 | 0.467 | 0.6478 | 0.5815 | 0.7907 |
| P11562_107 | P11562_108 | P11562_110 | P11562_111 | P11562_113 | P11562_114 |
|---|---|---|---|---|---|
| 0.6775 | 0.5038 | 0.3461 | 1.131 | 1.008 | 0.4456 |
| P11562_115 | P11562_116 | P11562_117 | P11562_118 | P11562_119 | P11562_120 |
|---|---|---|---|---|---|
| 0.7094 | 0.2015 | 0.8516 | 0.3668 | 0.7316 | 0.487 |
| P11562_121 | P11562_123 | P11562_124 | P11562_125 | P11562_126 | P11562_127 |
|---|---|---|---|---|---|
| 0.927 | 0.7981 | 0.2844 | 0.8587 | 0.3855 | 1.104 |
| P11562_128 | P11562_129 | P11562_130 | P11562_131 | P11562_133 | P11562_134 |
|---|---|---|---|---|---|
| 0.7491 | 0.8962 | 0.7644 | 0.8868 | 1.271 | 0.8032 |
| P11562_135 | P11562_136 | P11562_137 | P11562_138 | P11562_139 | P11562_140 |
|---|---|---|---|---|---|
| 1.132 | 0.8747 | 0.7776 | 0.5863 | 0.7706 | 0.7332 |
| P11562_141 | P11562_142 | P11562_143 | P11562_145 | P11562_146 | P11562_147 |
|---|---|---|---|---|---|
| 1.254 | 1.092 | 0.7181 | 1.088 | 0.6238 | 0.7118 |
| P11562_149 | P11562_150 | P11562_151 | P11562_153 | P11562_154 | P11562_155 |
|---|---|---|---|---|---|
| 0.7464 | 1.154 | 0.9141 | 0.4101 | 0.7425 | 1.257 |
| P11562_157 | P11562_201 | P11562_202 | P11562_203 | P11562_204 | P11562_205 |
|---|---|---|---|---|---|
| 0.8647 | 0.5189 | 0.761 | 0.9157 | 0.3734 | 0.7593 |
| P11562_206 | P1790_101 | P1790_102 | P1790_103 | P1790_104 | P1790_105 |
|---|---|---|---|---|---|
| 0.4669 | 0.7054 | 0.6729 | 0.8753 | 0.9004 | 0.9158 |
| P1790_106 | P1790_107 | P1790_108 | P1790_109 | P1790_110 | P1790_111 |
|---|---|---|---|---|---|
| 0.739 | 0.6543 | 0.6032 | 0.6514 | 0.7175 | 0.8266 |
| P1790_112 | P2228_101 | P2228_102 | P2228_103 | P2228_104 | P2228_105 |
|---|---|---|---|---|---|
| 0.8576 | 2.065 | 1.768 | 1.725 | 1.595 | 2.229 |
| P2228_106 | P2228_107 | P2228_108 | P2228_109 | P2228_110 | P2228_111 |
|---|---|---|---|---|---|
| 1.681 | 1.564 | 1.885 | 1.489 | 1.39 | 1.73 |
| P2228_112 | P2228_113 | P2228_114 | P2228_115 | P2228_117 | P2228_118 |
|---|---|---|---|---|---|
| 2.027 | 1.509 | 1.443 | 1.983 | 1.619 | 1.65 |
| P2228_119 | P2228_120 | P464_202 | P464_203 | P464_204 | P464_205 | P464_206 |
|---|---|---|---|---|---|---|
| 1.742 | 1.213 | 1.37 | 1.327 | 1.274 | 0.9179 | 1.241 |
| P464_208 | P464_211 | P464_201B | P464_209B | P464_207B | P7614_301 |
|---|---|---|---|---|---|
| 1.098 | 1.481 | 0.6571 | 0.4765 | 1.108 | 1.803 |
| P7614_302 | P7614_303 | P7614_304 | P7614_305 | P7614_306 | P7614_307 |
|---|---|---|---|---|---|
| 1.785 | 1.983 | 1.89 | 1.998 | 2.385 | 1.952 |
| P7614_308 | P7614_309 | P7614_310 | P7614_311 | P7614_312 | P7614_313 |
|---|---|---|---|---|---|
| 2.085 | 1.656 | 1.496 | 1.653 | 1.52 | 1.875 |
| P7614_314 | P7614_315 | P7614_316 | P7614_317 | P7614_318 | P7614_319 |
|---|---|---|---|---|---|
| 1.743 | 2.034 | 1.92 | 1.671 | 1.866 | 1.692 |
| P7614_320 | P7614_321 | P7614_322 | P7614_323 | P7614_324 |
|---|---|---|---|---|
| 1.651 | 1.967 | 1.885 | 1.745 | 1.754 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked very well, the data is nearly homoscesdastic
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
Seems that different time points form small clusters and ZE and FMG tissue types appear to separate. These are Comp1 and Comp2 which explains the different between most of the sampels except for one B4 ZE sample. Appears to be an outlier. This seems to indicate that the Tissue and Time components explain the difference between samples.
#mar=c(5.1,4.1,4.1,2.1)
#scatterplot3d(pc$x[,1],
# pc$x[,2],
# pc$x[,3],
# xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
# ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
# zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
# color=pal[as.integer(samples$Tissue)],
# pch=c(17,19,15,13)[as.integer(samples$Experiment)])
#legend("topleft",
# fill=pal[1:nlevels(samples$Tissue)],
# legend=levels(samples$Tissue))
#legend("bottomright",
# pch=c(17,19,15,13),
# legend=levels(samples$Experiment))
#par(mar=mar)
samples$NGI.ID
## [1] "P11562_101" "P11562_102" "P11562_103" "P11562_104" "P11562_105"
## [6] "P11562_106" "P11562_107" "P11562_108" "P11562_110" "P11562_111"
## [11] "P11562_113" "P11562_114" "P11562_115" "P11562_116" "P11562_117"
## [16] "P11562_118" "P11562_119" "P11562_120" "P11562_121" "P11562_123"
## [21] "P11562_124" "P11562_125" "P11562_126" "P11562_127" "P11562_128"
## [26] "P11562_129" "P11562_130" "P11562_131" "P11562_133" "P11562_134"
## [31] "P11562_135" "P11562_136" "P11562_137" "P11562_138" "P11562_139"
## [36] "P11562_140" "P11562_141" "P11562_142" "P11562_143" "P11562_145"
## [41] "P11562_146" "P11562_147" "P11562_148" "P11562_149" "P11562_150"
## [46] "P11562_151" "P11562_153" "P11562_154" "P11562_155" "P11562_157"
## [51] "P11562_201" "P11562_202" "P11562_203" "P11562_204" "P11562_205"
## [56] "P11562_206" "P1790_101" "P1790_102" "P1790_103" "P1790_104"
## [61] "P1790_105" "P1790_106" "P1790_107" "P1790_108" "P1790_109"
## [66] "P1790_110" "P1790_111" "P1790_112" "P2228_101" "P2228_102"
## [71] "P2228_103" "P2228_104" "P2228_105" "P2228_106" "P2228_107"
## [76] "P2228_108" "P2228_109" "P2228_110" "P2228_111" "P2228_112"
## [81] "P2228_113" "P2228_114" "P2228_115" "P2228_117" "P2228_118"
## [86] "P2228_119" "P2228_120" "P464_202" "P464_203" "P464_204"
## [91] "P464_205" "P464_206" "P464_208" "P464_211" "P464_201B"
## [96] "P464_209B" "P464_207B" "P7614_301" "P7614_302" "P7614_303"
## [101] "P7614_304" "P7614_305" "P7614_306" "P7614_307" "P7614_308"
## [106] "P7614_309" "P7614_310" "P7614_311" "P7614_312" "P7614_313"
## [111] "P7614_314" "P7614_315" "P7614_316" "P7614_317" "P7614_318"
## [116] "P7614_319" "P7614_320" "P7614_321" "P7614_322" "P7614_323"
## [121] "P7614_324"
samples <- samples[!(samples$NGI.ID == "P11562_148"),]
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples)
ggplot(pc.dat,aes(x=PC1,y=PC2,col=Tissue,shape=Experiment)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
suppressPackageStartupMessages(library(plotly))
interplot <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Tissue,shape=Experiment,text=NGI.ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(interplot, tooltip = "all") %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise A cutoff at a VST value of 1 leaves about 32000 genes - is this adequate for the QA?
conds <- factor(paste(samples$Tissue,samples$Experiment))
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
vstCutoff <- 6+1
hm <- heatmap.2(t(scale(t(vst[sels[[vstCutoff]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal,cexCol = .2)
plot(as.hclust(hm$colDendrogram))
Perform the VST, aware
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)
A cutoff at 1 VST seems appropriate for both
sels <- rangeFeatureSelect(
counts=as.matrix(vst),
conditions=colData(dds)$Experiment,nrep=3)
# check and decide on cutoff
cutoff=4
sel <- sels[[cutoff]]
dir.create(here("data/seidr"),showWarnings=FALSE,recursive=TRUE)
#' * gene by column, without names matrix
write.table(t(vst[sel,]),
file=here("data/seidr/headless.tsv"),
col.names=FALSE,
row.names=FALSE,
sep="\t",quote=FALSE)
#' * gene names, one row
# TODO check IDs
write.table(t(sub("\\.1$","",rownames(vst)[sel])),
file=here("data/seidr/genes.tsv"),
col.names=FALSE,
row.names=FALSE,
sep="\t",quote=FALSE)
The data appears to show a correlation between Tissue and Time, with some clustering in the PCA plot showing a movement of time in ascending order from right to left (+ to -) along the X axis. There can also be seen that along the Y axis, there are subgroups which correlate to the Tissue type. Along with this, it appears that overall gene expression does not appear starkly different but slight differences can be observed between Tissue types. It appears that the most difference between Tissues can be seen in the 1000 most variable genes, with some small changes between Time points within those Tissue groups. The final heat map of the 1000 least variable genes appear to show a very mixed expression pattern, similar in appearance to the total gene heat map, between Tissue groups and Time points, except for in the Somatic tissue type.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] here_0.1 vsn_3.52.0
## [3] tximport_1.12.3 forcats_0.4.0
## [5] stringr_1.4.0 dplyr_0.8.3
## [7] purrr_0.3.2 readr_1.3.1
## [9] tidyr_1.0.0 tibble_2.1.3
## [11] tidyverse_1.2.1 scatterplot3d_0.3-41
## [13] RColorBrewer_1.1-2 plotly_4.9.0
## [15] pander_0.6.3 hyperSpec_0.99-20180627
## [17] ggplot2_3.2.1 lattice_0.20-38
## [19] gplots_3.0.1.1 DESeq2_1.24.0
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [23] BiocParallel_1.18.1 matrixStats_0.55.0
## [25] Biobase_2.44.0 GenomicRanges_1.36.1
## [27] GenomeInfoDb_1.20.0 IRanges_2.18.3
## [29] S4Vectors_0.22.1 BiocGenerics_0.30.0
## [31] data.table_1.12.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.2
## [4] XVector_0.24.0 base64enc_0.1-3 rstudioapi_0.10
## [7] hexbin_1.27.3 affyio_1.54.0 bit64_0.9-7
## [10] AnnotationDbi_1.46.1 lubridate_1.7.4 xml2_1.2.2
## [13] splines_3.6.1 geneplotter_1.62.0 knitr_1.25
## [16] zeallot_0.1.0 Formula_1.2-3 jsonlite_1.6
## [19] broom_0.5.2 annotate_1.62.0 cluster_2.1.0
## [22] shiny_1.3.2 BiocManager_1.30.4 compiler_3.6.1
## [25] httr_1.4.1 backports_1.1.5 assertthat_0.2.1
## [28] Matrix_1.2-17 lazyeval_0.2.2 limma_3.40.6
## [31] cli_1.1.0 later_1.0.0 acepack_1.4.1
## [34] htmltools_0.3.6 tools_3.6.1 affy_1.62.0
## [37] gtable_0.3.0 glue_1.3.1 GenomeInfoDbData_1.2.1
## [40] reshape2_1.4.3 Rcpp_1.0.2 cellranger_1.1.0
## [43] vctrs_0.2.0 preprocessCore_1.46.0 gdata_2.18.0
## [46] nlme_3.1-141 crosstalk_1.0.0 xfun_0.10
## [49] testthat_2.2.1 rvest_0.3.4 mime_0.7
## [52] lifecycle_0.1.0 gtools_3.8.1 XML_3.98-1.20
## [55] zlibbioc_1.30.0 scales_1.0.0 promises_1.0.1
## [58] hms_0.5.1 yaml_2.2.0 memoise_1.1.0
## [61] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-28
## [64] stringi_1.4.3 RSQLite_2.1.2 highr_0.8
## [67] genefilter_1.66.0 checkmate_1.9.4 caTools_1.17.1.2
## [70] rlang_0.4.0 pkgconfig_2.0.3 bitops_1.0-6
## [73] evaluate_0.14 labeling_0.3 htmlwidgets_1.3
## [76] bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4
## [79] magrittr_1.5 R6_2.4.0 generics_0.0.2
## [82] Hmisc_4.2-0 DBI_1.0.0 pillar_1.4.2
## [85] haven_2.1.1 foreign_0.8-72 withr_2.1.2
## [88] survival_2.44-1.1 RCurl_1.95-4.12 nnet_7.3-12
## [91] modelr_0.1.5 crayon_1.3.4 KernSmooth_2.23-15
## [94] rmarkdown_1.16 locfit_1.5-9.1 readxl_1.3.1
## [97] blob_1.2.0 digest_0.6.21 xtable_1.8-4
## [100] httpuv_1.5.2 munsell_0.5.0 viridisLite_0.3.0